import pandas as pd
from shutil import copyfile
import matplotlib.pyplot as plt #for graphs
import pandas as pd #for storing data
import numpy as np #for math operations
import seaborn as sns #for visualisation
import matplotlib.ticker as ticker #to change graph axes
from datetime import date #date manipulation
from sklearn.preprocessing import scale #standardization


df = pd.read_excel("Euro.xlsx", header = None)
df.index = df[0]
df = df.drop(0, axis =1)
df


# find all the stocks in the dataset
lst = list(df[df.isnull().all(axis=1)].index)
del lst[2 - 1::2]
stocks = []
for i in lst:
    stocks.append(" ".join(i.split(" ")[:-1]))


# save date from columns name
time = df.iloc[1]


df = df.replace(0,np.nan)


def get_factor(name):
    sub = df.loc[name]
    sub.index = stocks
    sub.columns = time
    return sub


date_df = get_factor('Date')


date_df = date_df.drop(date_df.columns[0], axis=1)


boole = ~date_df.isnull().any(axis=1).values


# Number of deleted stocks
len(boole) - boole.sum()

163


# Remove the stock with different time series from stock list
stocks = date_df[boole].index


# Chenge the get_factor function in order to remove stock with different time frequency
def get_factor(name):
    sub = df.loc[name][boole]
    sub.index = stocks
    sub.columns = time
    return sub


tot_factors = ['PE_RATIO', 'FIVE_YR_AVG_PRICE_EARNINGS', 'T12M_DIL_PE_CONT_OPS', '10_YEAR_MOVING_AVERAGE_PE', 'PX_TO_TANG_BV_PER_SH',
 'CURRENT_EV_TO_12M_SALES', 'CURRENT_EV_TO_T12M_EBITDA', 'FIVE_YEAR_AVG_EV_TO_T12_EBITDA', 'T12M_DIL_EPS_CONT_OPS',
 'TRAIL_12M_EBITDA_PER_SHARE', 'TRAIL_12M_SALES_PER_SH', 'NET_DEBT_PER_SHARE', 'TANG_BOOK_VAL_PER_SH',
 'NORMALIZED_ACCRUALS_CF_METHOD', 'EBITDA_MARGIN', 'EBITDA_MARGIN_3YR_AVG', 'RSI_14D', 'RSI_30D', 'RSI_9D', 'OPERATING_ROIC',
 'EQY_DPS_NET_5YR_GROWTH', 'EQY_REC_CONS', 'BEST_EPS', 'WACC_COST_EQUITY', 'NORMALIZED_ROE', '5YR_AVG_RETURN_ON_EQUITY',
 'CUR_MKT_CAP', 'PX_TO_BOOK_RATIO']


nan_factors = []
for factor_name in tot_factors:
    factor = get_factor(factor_name)
    for col in range(1,len(time),3):
        # For a factor to be mantained it has to have at least 50 non-nan stocks
        if factor.iloc[:,col].notnull().sum() < 50:
            nan_factors.append(factor_name)
            break


nan_factors

['T12M_DIL_PE_CONT_OPS',
 '10_YEAR_MOVING_AVERAGE_PE',
 'T12M_DIL_EPS_CONT_OPS',
 'BEST_EPS']


tot_factors = [x for x in tot_factors if x not in nan_factors]


price = get_factor("PX_LAST")


def returns(row):
    row = row.astype('float64')
    return np.log(row) - np.log(row.shift(1))


log_returns = price.apply(returns, axis = 1)


log_returns.head()


def multiplicator(row):
    return row/row.shift(1)
price_mul = price.apply(multiplicator, axis=1)


# We initialize an empty dataframe
sub = pd.DataFrame(columns=time, index = stocks)
i=2
# Remove the transaction cost in the first iteration of wealth
wealth = 600000 * 0.998
# To calculate the benchmark we select only the stocks that have no null information in the price df
sel_stocks = price[~price.isnull().any(axis=1)].index
# Allocate wealth equally through the selected stocks
sub.loc[sel_stocks, sub.columns[1]] = wealth/len(sel_stocks)
# Calculate the value of the portfolio at each iteration of time
while i < len(time):
    sub.iloc[:,i] = sub.iloc[:,i-1] * price_mul.iloc[:,i]
    i += 1
# Build output dataframe
bench = pd.DataFrame(columns=time[1:])
# Insert value information
bench.loc['value'] = sub.sum()[1:]
# Insert return information
bench.loc['returns'] = (bench.loc['value'].astype('float') - bench.loc['value'].shift(1).astype('float'))/bench.loc['value'].shift(1).astype('float')
bench.loc['cum_returns'] = (bench.loc['value'].astype('float') - bench.loc['value'][1])/bench.loc['value'][1]
bench.loc['log_returns'] = np.log(bench.loc['value'].astype('float')) - np.log(bench.loc['value'].shift(1).astype('float'))


benchmark = bench


benchmark


plt.figure().set_figwidth(12)

plt.subplot(1,2,1)
benchmark.loc['log_returns'].plot(label = 'Benchmark')
plt.title("Benchmark log returns")
plt.ylabel("log return")
plt.grid()

plt.subplot(1,2,2)
benchmark.loc['value'].plot(label = 'Benchmark')
plt.title("Benchmark value")
plt.ylabel("value (in million)")
plt.grid()

plt.show()


def uniPortfolio(data, holding, worst = False, wealth = 600000, n_stocks=15, INtrans_cost = 0.002):
    # Initiate dataset
    sub = pd.DataFrame(columns=time, index = stocks)
    # Set i = 1, we want to start at month 2, so that we can use the information of the factor at month 1
    i=1
    # Remove the transaction cost in the first iteration of wealth
    wealth = wealth * (1 - INtrans_cost)
    # Loop through i. i is going to represent the month in which the selected stocks are modified based on the factor
    while i < len(time):
        # If we are not in the first month, we have to compute the portfolio value in the new month before reallocation
        if i != 1:
            portfT = sub.iloc[:,i-1] * price_mul.iloc[:,i]
            # Overwrite the wealth as the portfolio total value
            wealth = portfT.sum()
        # Create a copy of the factor
        factor = data
        # From this new copy, select only the rows of the stocks for which we have the price_mul information
        if i+holding < len(time):
            factor = factor.loc[price_mul.iloc[:,range(i,i+ holding)].notnull().all(axis=1),:]
        else:
            factor = factor.loc[price_mul.iloc[:,i:].notnull().all(axis=1),:]
        # Select stocks based on factor
        sel_stocks = factor.sort_values(factor.columns[i-1], ascending=worst).index[:n_stocks]
        # Allocate wealth equally through the selected stocks
        sub.loc[sel_stocks, sub.columns[i]] = wealth/n_stocks
        # If we are not in the first month, than we have to compute the transaction costs
        if i != 1:
            # We compute the transaction costs based on the difference between the portfolio before and after reallocation
            trans_cost = abs(sub.iloc[:,i] - portfT).sum() * INtrans_cost
            # Remove the transaction cost from total wealth
            wealth = wealth - trans_cost
            # Overwrite allocation of wealth equally through the selected stocks
            sub.loc[sel_stocks, sub.columns[i]] = wealth/n_stocks
        # J is going to loop through the months in which there is no reallocation
        for j in range(1,holding):
            # Break the j loop if the months are finished
            if i + j > len(time)-1:
                break
            # Compute the value of all the stocks at time t as their value at time t-1 times theprice multiplicator at time t
            sub.iloc[:,i+j] = sub.iloc[:,i+j-1] * price_mul.iloc[:,i+j]
        # Increment i by the holding period before next iteration
        i += holding
    # Since we started from i=1, the first element of the subset is going to be 0, so we are going to remove it
    # Build output dataframe
    portf = pd.DataFrame(columns=time[1:])
    # Insert value information
    portf.loc['value'] = sub.sum()[1:]
    # Insert return information
    portf.loc['returns'] = (portf.loc['value'].astype('float') - portf.loc['value'].shift(1).astype('float'))/portf.loc['value'].shift(1).astype('float')
    portf.loc['cum_returns'] = (portf.loc['value'].astype('float') - portf.loc['value'][0])/portf.loc['value'][0]
    portf.loc['log_returns'] = np.log(portf.loc['value'].astype('float')) - np.log(portf.loc['value'].shift(1).astype('float'))
    return portf


def IR(port):
    num2 = benchmark.loc['log_returns']
    num1 = port.loc['log_returns']
    num = (num1 - num2).mean() * 12
    den = np.std(port.loc['log_returns'] - benchmark.loc['log_returns']) * np.sqrt(12)
    return num/den


# Import the data from yahoo finance
import yfinance as yf
IRX = yf.download('^IRX','2002-12-31','2012-03-30', interval='1mo')
IRX = IRX['Close']

[*********************100%***********************]  1 of 1 completed


IRX = pd.DataFrame(IRX).transpose()
IRX.index = ["value"]
IRX.columns = time
IRX


IRX.loc['log_returns'] = np.log(IRX.loc['value'].astype('float')) - np.log(IRX.loc['value'].shift(1).astype('float'))


def Sharpe(port):
    num2 = IRX.loc['log_returns'][1:]
    num1 = port.loc['log_returns']
    num = (num1 - num2).mean()
    den = np.std(port.loc['log_returns'])
    return num/den


def Sortino(port):
    num2 = IRX.loc['log_returns'][1:]
    num1 = port.loc['log_returns']
    num = (num1 - num2).mean()
    den = np.std(port.loc['log_returns', port.loc['log_returns'] < port.loc['log_returns'].mean()])
    return num/den


# We are going to store the data in a list of two dictionary
comparison = [{},{}]
worst_list = [False, True]
# We are going to use the average of this four different holding periods
hold_list = [3,6,12,111]
# The first dictionary is going to contain the metrics relative to the portfolio that use the stocks
# with higher value of the factor. The second will use the lower.
for worst in range(len(worst_list)):
    # Loop through the factor
    for factor in tot_factors:
        # get the data of the factor
        sub = get_factor(factor)
        lst = []
        for i in range(4):
            hold = hold_list[i]
            port = uniPortfolio(sub, hold, n_stocks=100, worst = worst_list[worst], INtrans_cost = 0)
            lst.append(IR(port))
        # The resulting IR of the portfolio is computed as the average IR between the different holding periods
        comparison[worst][factor] = sum(lst) / len(lst)


X_axis = np.arange(len(comparison[0]))

plt.bar(X_axis -0.2, comparison[0].values(), 0.4, label = 'Top')
plt.bar(X_axis +0.2, comparison[1].values(), 0.4, label = 'Bottom')
plt.axhline(y = 0, color = 'black')

  
plt.xticks(X_axis, comparison[0].keys(), rotation = 90)
plt.xlabel("Factors")
plt.ylabel("Information Ratio")
plt.title("Average IR of the Top and Bottom portfolios for each factor")
plt.legend()
plt.show()


diz_factors = {}
for el in range(len(comparison[0])):
    diz_factors[list(comparison[0].keys())[el]] = list(comparison[0].values())[el] < list(comparison[1].values())[el]
diz_factors

{'PE_RATIO': True,
 'FIVE_YR_AVG_PRICE_EARNINGS': True,
 'PX_TO_TANG_BV_PER_SH': True,
 'CURRENT_EV_TO_12M_SALES': True,
 'CURRENT_EV_TO_T12M_EBITDA': True,
 'FIVE_YEAR_AVG_EV_TO_T12_EBITDA': True,
 'TRAIL_12M_EBITDA_PER_SHARE': True,
 'TRAIL_12M_SALES_PER_SH': True,
 'NET_DEBT_PER_SHARE': True,
 'TANG_BOOK_VAL_PER_SH': True,
 'NORMALIZED_ACCRUALS_CF_METHOD': True,
 'EBITDA_MARGIN': True,
 'EBITDA_MARGIN_3YR_AVG': True,
 'RSI_14D': True,
 'RSI_30D': True,
 'RSI_9D': False,
 'OPERATING_ROIC': False,
 'EQY_DPS_NET_5YR_GROWTH': True,
 'EQY_REC_CONS': False,
 'WACC_COST_EQUITY': True,
 'NORMALIZED_ROE': False,
 '5YR_AVG_RETURN_ON_EQUITY': True,
 'CUR_MKT_CAP': True,
 'PX_TO_BOOK_RATIO': True}


# The information is goint to be stored in a list of 4 dictionaries
hold_comparison = [{},{},{},{}]
# hold_list contains the holding periods we are going to compare
hold_list = [3,6,12,111]
# For each holding period and for each factor, compute the resulting IR of the portfolio
for i in range(4):
    hold = hold_list[i]
    for factor in diz_factors.keys():
        sub = get_factor(factor)
        port = uniPortfolio(sub, hold, n_stocks=15, worst = diz_factors[factor])
        hold_comparison[i][factor + "_" + str(hold)] = IR(port)


lst_holdComp = []
# We store the value information in a list containing 4 lists
for i in range(4):
    lst_holdComp.append(list(hold_comparison[i].values()))


import warnings
warnings.filterwarnings("ignore")


fig = plt.figure(figsize =(7, 4))
# Creating axes instance
ax = fig.add_axes([0, 0, 1, 1])
ax.set_xticklabels(hold_list)
# Creating plot
bp = ax.boxplot(lst_holdComp, showmeans=True)
plt.axhline(y = 0, color = 'black')
plt.title("Boxplot of the IR achieved for every holding period")
plt.ylabel("Information Ratio")
plt.xlabel("Holding Period")
plt.show()


# Initialize an empty rank dataframe
rank = pd.DataFrame(columns = ['Information', 'Sharpe', 'Sortino'], index = list(diz_factors.keys()))


# For each factor, build a portfolio and insert in the dataframe its resulting metrics
for factor in list(diz_factors.keys()):
    port = uniPortfolio(get_factor(factor), 12, n_stocks=15, worst=diz_factors[factor])
    rank.loc[factor, 'Information'] = IR(port)
    rank.loc[factor, 'Sharpe'] = Sharpe(port)
    rank.loc[factor, 'Sortino'] = Sortino(port)


# Order the resulting portfolios by their resulting Information Ratio
rank = rank.sort_values(by='Information', ascending=False)
rank


plt.figure(figsize =(10, 6))

benchmark.loc['value'].plot(label = 'benchmark', color='black')
uniPortfolio(get_factor('CUR_MKT_CAP'), 12, worst=diz_factors['CUR_MKT_CAP']).loc['value'].plot(label = 'CUR_MKT_CAP', color='red')
uniPortfolio(get_factor('PE_RATIO'), 12, worst=diz_factors['PE_RATIO']).loc['value'].plot(label = 'PE_RATIO')
uniPortfolio(get_factor('CURRENT_EV_TO_12M_SALES'), 12, worst=diz_factors['CURRENT_EV_TO_12M_SALES']).loc['value'].plot(label = 'CURRENT_EV_TO_12M_SALES')
uniPortfolio(get_factor('FIVE_YEAR_AVG_EV_TO_T12_EBITDA'), 12, worst=diz_factors['FIVE_YEAR_AVG_EV_TO_T12_EBITDA']).loc['value'].plot(label = 'FIVE_YEAR_AVG_EV_TO_T12_EBITDA')
uniPortfolio(get_factor('CURRENT_EV_TO_T12M_EBITDA'), 12, worst=diz_factors['CURRENT_EV_TO_T12M_EBITDA']).loc['value'].plot(label = 'CURRENT_EV_TO_T12M_EBITDA')
plt.title("Current Market Cap vs other 4 best univariate portfolios")
plt.ylabel("value (in tens of millions)")
plt.grid()
plt.legend()
plt.show()


plt.figure().set_figwidth(12)

plt.subplot(1,2,1)
benchmark.loc['log_returns'].plot(color = 'black', label='benchmark')
uniPortfolio(get_factor('CUR_MKT_CAP'), 12, worst=diz_factors['CUR_MKT_CAP'], n_stocks=1).loc['log_returns'].plot(color='red')
plt.axvspan(455, 458, color="blue", alpha=0.2)
plt.axvspan(493, 498, color="blue", alpha=0.2)
plt.ylabel("log returns")
plt.grid()

plt.subplot(1,2,2)
benchmark.loc['value'].plot(color = 'black', label='benchmark')
uniPortfolio(get_factor('CUR_MKT_CAP'), 12, worst=diz_factors['CUR_MKT_CAP'], n_stocks=1).loc['value'].plot(color='red', label='CurMktCap')
plt.grid()
plt.ylabel("Value (in tens of millions)")
plt.axvspan(455, 458, color="blue", alpha=0.2)
plt.axvspan(493, 498, color="blue", alpha=0.2)

plt.suptitle("Cur Mkt Cap with one asset")
plt.show()


factor = get_factor('CUR_MKT_CAP')
year = 2003
for col in list(range(0,111,12)):
    non_nan_stocks = price.iloc[:,col:col+11][~price.iloc[:,col:col+11].isnull().any(axis=1)].index
    print(year, ": ", list(factor.loc[non_nan_stocks].iloc[:,col].sort_values(ascending=True).index)[0])
    year += 1

2003 :  WDI GR
2004 :  H4G GR
2005 :  ABE1 GR
2006 :  LCA1 GR
2007 :  MSU GR
2008 :  EUZ GR
2009 :  SIS GR
2010 :  SIS GR
2011 :  PRC FP
2012 :  FNM IM


plt.figure(figsize =(10, 6))

price.loc['MSU GR'].plot(label = 'MSU GR')
price.loc['PRC FP'].plot(label = 'PRC FP')
plt.axvspan(455, 458, color="blue", alpha=0.2)
plt.axvspan(493, 498, color="blue", alpha=0.2)
plt.ylabel("price")
plt.title("prices of MSU GRU and PRC FP")
plt.legend()
plt.grid()
plt.show()


uni_pe = uniPortfolio(get_factor('PE_RATIO'), 12, worst=diz_factors['PE_RATIO'])
uni_evtosales = uniPortfolio(get_factor('CURRENT_EV_TO_12M_SALES'), 12, worst=diz_factors['CURRENT_EV_TO_12M_SALES'])
uni_avgEbitda = uniPortfolio(get_factor('FIVE_YEAR_AVG_EV_TO_T12_EBITDA'), 12, worst=diz_factors['FIVE_YEAR_AVG_EV_TO_T12_EBITDA'])
uni_currEbitda = uniPortfolio(get_factor('CURRENT_EV_TO_T12M_EBITDA'), 12, worst=diz_factors['CURRENT_EV_TO_T12M_EBITDA'])
uni_pxToBook = uniPortfolio(get_factor('PX_TO_BOOK_RATIO'), 12, worst=diz_factors['PX_TO_BOOK_RATIO'])


plt.figure(figsize =(10, 6))

benchmark.loc['value'].plot(label = 'benchmark', color='black')
uni_pe.loc['value'].plot(label = 'PE_RATIO', color='C0')
uni_evtosales.loc['value'].plot(label = 'CURRENT_EV_TO_12M_SALES', color='C1')
uni_avgEbitda.loc['value'].plot(label = 'FIVE_YEAR_AVG_EV_TO_T12_EBITDA', color='C2')
uni_currEbitda.loc['value'].plot(label = 'CURRENT_EV_TO_T12M_EBITDA', color='C3')
uni_pxToBook.loc['value'].plot(label = 'PX_TO_BOOK_RATIO', color='C4')

plt.text(508, uni_pe.loc['value'][-1], 'IR: ' + str(round(IR(uni_pe),2)), color='C0')
plt.text(508, uni_evtosales.loc['value'][-1], 'IR: ' + str(round(IR(uni_evtosales),2)), color='C1')
plt.text(508, uni_avgEbitda.loc['value'][-1] +200000, 'IR: ' + str(round(IR(uni_avgEbitda),2)), color='C2')
plt.text(508, uni_currEbitda.loc['value'][-1] -200000, 'IR: ' + str(round(IR(uni_currEbitda),2)), color='C3')
plt.text(508, uni_pxToBook.loc['value'][-1], 'IR: ' + str(round(IR(uni_pxToBook),2)), color='C4')

plt.title("Comparison between the 5 best univariate portfolios")
plt.ylabel("value (in tens of millions)")
plt.legend()
plt.grid()
plt.show()


def seqPortfolio(data, holding, worst = None, wealth = 600000):
    # build the "worst" list
    if worst is None:
        worst = []
        for factor in range(len(data)):
            worst.append(diz_factors[data[factor]])
    # Initiate dataset
    sub = pd.DataFrame(columns=time, index = stocks)
    # Set i = 1, we want to start at month 2, so that we can use the information of the factor at month 1
    i=1
    # Remove the transaction cost in the first iteration of wealth
    wealth = wealth * 0.998
    # Loop through i. i is going to represent the month in which the selected stocks are modified based on the factor
    while i < len(time):
        # If we are not in the first month, we have to compute the portfolio value in the new month before reallocation
        if i != 1:
            portfT = sub.iloc[:,i-1] * price_mul.iloc[:,i]
            # Overwrite the wealth as the portfolio total value
            wealth = portfT.sum()
        # Create a copy of the factors
        factor1 = get_factor(data[0])
        factor2 = get_factor(data[1])
        factor3 = get_factor(data[2])
        # From this new copy, select only the rows of the stocks for which we have the price_mul information
        factor1 = factor1.loc[price_mul.iloc[:,i].notnull(),:]
        # Select stocks based on factor
        sel_stocks = factor1.sort_values(factor1.columns[i-1], ascending=worst[0]).index[:round(len(factor1)*0.3)]
        factor2 = factor2.loc[sel_stocks]
        sel_stocks = factor2.sort_values(factor2.columns[i-1], ascending=worst[1]).index[:round(len(factor2)*0.3)]
        factor3 = factor3.loc[sel_stocks]
        sel_stocks = factor3.sort_values(factor3.columns[i-1], ascending=worst[2]).index[:15]
        # Allocate wealth equally through the selected stocks
        sub.loc[sel_stocks, sub.columns[i]] = wealth/len(sel_stocks)
        # If we are not in the first month, than we have to compute the transaction costs
        if i != 1:
            # We compute the transaction costs based on the difference between the portfolio before and after reallocation
            trans_cost = abs(sub.iloc[:,i] - portfT).sum() * 0.002
            # Remove the transaction cost from total wealth
            wealth = wealth - trans_cost
            # Overwrite allocation of wealth equally through the selected stocks
            sub.loc[sel_stocks, sub.columns[i]] = wealth/len(sel_stocks)
        # J is going to loop through the months in which there is no reallocation
        for j in range(1,holding):
            # Break the j loop if the months are finished
            if i + j > len(time)-1:
                break
            # Compute the value of all the stocks at time t as their value at time t-1 times theprice multiplicator at time t
            sub.iloc[:,i+j] = sub.iloc[:,i+j-1] * price_mul.iloc[:,i+j]
        # Increment i by the holding period before next iteration
        i += holding
    # Since we started from i=1, the first element of the subset is going to be 0, so we are going to remove it
    # Build output dataframe
    portf = pd.DataFrame(columns=time[1:])
    # Insert value information
    portf.loc['value'] = sub.sum()[1:]
    # Insert return information    
    portf.loc['returns'] = (portf.loc['value'].astype('float') - portf.loc['value'].shift(1).astype('float'))/portf.loc['value'].shift(1).astype('float')
    portf.loc['cum_returns'] = (portf.loc['value'].astype('float') - portf.loc['value'][0])/portf.loc['value'][0]
    portf.loc['log_returns'] = np.log(portf.loc['value'].astype('float')) - np.log(portf.loc['value'].shift(1).astype('float'))
    return portf


portf_seq_top3 = seqPortfolio(['PE_RATIO','CURRENT_EV_TO_12M_SALES','FIVE_YEAR_AVG_EV_TO_T12_EBITDA'], 12)
portf_seq_top3


portf_seq_top3_inv = seqPortfolio(['FIVE_YEAR_AVG_EV_TO_T12_EBITDA','CURRENT_EV_TO_12M_SALES','PE_RATIO'], 12)
portf_seq_top3


plt.figure(figsize =(10, 6))
uni_pe.loc['value'].plot(label = 'PE_RATIO', color='C0', linestyle = 'dashed')
portf_seq_top3.loc['value'].plot(label = 'Portfolio Top3', color='C1')
portf_seq_top3_inv.loc['value'].plot(label = 'Portfolio Top3 Inv', color='C2')
benchmark.loc['value'].plot(label = 'Benchmark', color = 'black')
plt.grid()
plt.title("Comparison between sequential portfolios")
plt.ylabel("Value (in millions)")
plt.text(508, uni_pe.loc['value'][-1], 'IR: ' + str(round(IR(uni_pe),2)), color='C0')
plt.text(508, portf_seq_top3.loc['value'][-1], 'IR: ' + str(round(IR(portf_seq_top3),2)), color='C1')
plt.text(508, portf_seq_top3_inv.loc['value'][-1], 'IR: ' + str(round(IR(portf_seq_top3_inv),2)), color='C2')
plt.legend()
plt.show()


# Initialize an empty rank dataframe
rank_seq = pd.DataFrame(columns = ['Information', 'Sharpe', 'Sortino'],
                    index = ['portf_seq_top3', 'portf_seq_top3_inv'])


# For each portfolio insert in the dataframe its resulting metrics
port_list = [portf_seq_top3, portf_seq_top3_inv]
for i in range(2):
    rank_seq.iloc[i]['Information'] = IR(port_list[i])
    rank_seq.iloc[i]['Sharpe'] = Sharpe(port_list[i])
    rank_seq.iloc[i]['Sortino'] = Sortino(port_list[i])


# Order the resulting portfolios by their resulting Information Ratio
rank_seq


def simPortfolio(data, holding, wealth = 600000, n_stocks=15):
    # Initialize worst list
    worst = []
    for factor in range(len(data)):
        worst.append(diz_factors[data[factor]])
    # Initiate dataset
    sub = pd.DataFrame(columns=time, index = stocks)
    # Set i = 1, we want to start at month 2, so that we can use the information of the factor at month 1
    i=1
    # Remove the transaction cost in the first iteration of wealth
    wealth = wealth * 0.998
    # Loop through i. i is going to represent the month in which the selected stocks are modified based on the factor
    while i < len(time):
        # If we are not in the first month, we have to compute the portfolio value in the new month before reallocation
        if i != 1:
            portfT = sub.iloc[:,i-1] * price_mul.iloc[:,i]
            # Overwrite the wealth as the portfolio total value
            wealth = portfT.sum()
        # Create a copy of the factors
        fac = []
        for el in range(len(data)):
            fac.append(get_factor(data[el]).loc[price_mul.iloc[:,i].notnull(),:])
        # Initialize a df that is going to contain the z scores
        sub_factor = pd.DataFrame(columns=range(len(data)+1), index = fac[0].index)
        for el in range(len(data)):
            # Scale every column
            new_col = scale(fac[el].iloc[:,i-1])
            # If we have to get the bottom value stocks, we multiplicate the corresponding column for -1
            if worst[el]:
                new_col = new_col * -1
            sub_factor[el] = new_col
        # The last column of the df contains the average of the z score
        sub_factor.iloc[:,-1] = sub_factor.mean(axis=1)
        # We select the best stocks based on this last column
        sel_stocks = sub_factor.sort_values(sub_factor.columns[-1], ascending=False).index[:n_stocks]
        # Allocate wealth equally through the selected stocks
        sub.loc[sel_stocks, sub.columns[i]] = wealth/len(sel_stocks)
        # If we are not in the first month, than we have to compute the transaction costs
        if i != 1:
            # We compute the transaction costs based on the difference between the portfolio before and after reallocation
            trans_cost = abs(sub.iloc[:,i] - portfT).sum() * 0.002
            # Remove the transaction cost from total wealth
            wealth = wealth - trans_cost
            # Overwrite allocation of wealth equally through the selected stocks
            sub.loc[sel_stocks, sub.columns[i]] = wealth/len(sel_stocks)
        # J is going to loop through the months in which there is no reallocation
        for j in range(1,holding):
            # Break the j loop if the months are finished
            if i + j > len(time)-1:
                break
            # Compute the value of all the stocks at time t as their value at time t-1 times theprice multiplicator at time t
            sub.iloc[:,i+j] = sub.iloc[:,i+j-1] * price_mul.iloc[:,i+j]
        # Increment i by the holding period before next iteration
        i += holding
    # Since we started from i=1, the first element of the subset is going to be 0, so we are going to remove it
    # Build output dataframe
    portf = pd.DataFrame(columns=time[1:])
    # Insert value information
    portf.loc['value'] = sub.sum()[1:]
    # Insert return information    
    portf.loc['returns'] = (portf.loc['value'].astype('float') - portf.loc['value'].shift(1).astype('float'))/portf.loc['value'].shift(1).astype('float')
    portf.loc['cum_returns'] = (portf.loc['value'].astype('float') - portf.loc['value'][0])/portf.loc['value'][0]
    portf.loc['log_returns'] = np.log(portf.loc['value'].astype('float')) - np.log(portf.loc['value'].shift(1).astype('float'))
    return portf


portf_sim_top3 = simPortfolio(['PE_RATIO','CURRENT_EV_TO_12M_SALES','FIVE_YEAR_AVG_EV_TO_T12_EBITDA'], 12)
portf_sim_top3


portf_sim_top6 = simPortfolio(['PE_RATIO','CURRENT_EV_TO_12M_SALES','FIVE_YEAR_AVG_EV_TO_T12_EBITDA',
                              'CURRENT_EV_TO_T12M_EBITDA', 'PX_TO_BOOK_RATIO', 'NORMALIZED_ACCRUALS_CF_METHOD'], 12)
portf_sim_top6


plt.figure(figsize =(10, 6))

uni_pe.loc['value'].plot(label = 'PE_RATIO', color='C0', linestyle = 'dashed')
portf_seq_top3.loc['value'].plot(label = 'Sequential Top3', color='C1', linestyle = 'dashed')
portf_sim_top3.loc['value'].plot(label = "Simultaneous Top3", color='C2')
portf_sim_top6.loc['value'].plot(label = 'Simultaneous Top6', color='C3')
benchmark.loc['value'].plot(label = "benchmark", color='black')

plt.grid()
plt.ylabel("Value (in millions)")
plt.text(508, uni_pe.loc['value'][-1], 'IR: ' + str(round(IR(uni_pe),2)), color='C0')
plt.text(508, portf_seq_top3.loc['value'][-1], 'IR: ' + str(round(IR(portf_seq_top3),2)), color='C1')
plt.text(508, portf_sim_top3.loc['value'][-1], 'IR: ' + str(round(IR(portf_sim_top3),2)), color='C2')
plt.text(508, portf_sim_top6.loc['value'][-1], 'IR: ' + str(round(IR(portf_sim_top6),2)), color='C3')

plt.title("Comparison between univariate, sequential and simultaneous portfolios.")
plt.ylabel("value (in tens of millions)")
plt.legend()
plt.show()


rank.sort_values(by='Information', ascending=False).iloc[1]

Information    1.268581
Sharpe         0.775805
Sortino        0.982374
Name: PE_RATIO, dtype: object


data_univariate = []
for factor in tot_factors:
    data_univariate.append(list(uniPortfolio(get_factor(factor),12).loc['value']))


df_corr1 = np.corrcoef(data_univariate)


plt.imshow(df_corr1, cmap = 'RdBu_r', interpolation='nearest', vmax=1, vmin=0)
# Add legend
plt.colorbar()
labels = tot_factors
# Create list with tick_mark positions
tick_marks = [i for i in range(len(tot_factors))]
# Add the tickmarks at the designated position, using the labels of the dataframe
plt.xticks(tick_marks, labels, rotation='vertical', fontsize=9)
plt.yticks(tick_marks, labels, fontsize=9)
plt.show()


df_corr1 = pd.DataFrame(df_corr1, columns = tot_factors, index = tot_factors)


uncorr_factors = df_corr1.loc[:,'PE_RATIO'][df_corr1.loc[:,'PE_RATIO'] < 0.85]
uncorr_factors

CURRENT_EV_TO_12M_SALES    0.827129
TRAIL_12M_SALES_PER_SH     0.801906
TANG_BOOK_VAL_PER_SH       0.811133
EBITDA_MARGIN_3YR_AVG      0.771817
RSI_14D                    0.750167
RSI_30D                    0.823930
RSI_9D                     0.687097
Name: PE_RATIO, dtype: float64


rank.loc[uncorr_factors.index].sort_values(by='Information', ascending=False)


uncorr_factors = df_corr1.loc[uncorr_factors.index].loc[:,'CURRENT_EV_TO_12M_SALES'][df_corr1.loc[:,'CURRENT_EV_TO_12M_SALES'] < 0.85]
uncorr_factors

TRAIL_12M_SALES_PER_SH    0.829442
EBITDA_MARGIN_3YR_AVG     0.708955
Name: CURRENT_EV_TO_12M_SALES, dtype: float64


rank.loc[uncorr_factors.index].sort_values(by='Information', ascending=False)


plt.imshow(df_corr1.loc[['PE_RATIO', 'CURRENT_EV_TO_12M_SALES', 'EBITDA_MARGIN_3YR_AVG'], ['PE_RATIO', 'CURRENT_EV_TO_12M_SALES', 'EBITDA_MARGIN_3YR_AVG']],
           cmap = 'RdBu_r', interpolation='nearest', vmax=1, vmin=0)
# Add legend
plt.colorbar()
labels = ['PE_RATIO', 'CURRENT_EV_TO_12M_SALES', 'EBITDA_MARGIN_3YR_AVG']
# Create list with tick_mark positions
tick_marks = [i for i in range(3)]
# Add the tickmarks at the designated position, using the labels of the dataframe
plt.xticks(tick_marks, labels, rotation='vertical', fontsize=9)
plt.yticks(tick_marks, labels, fontsize=9)
plt.text(0, 1, '0.827', horizontalalignment="center", color='white')
plt.text(0, 2, '0.772', horizontalalignment="center", color='white')
plt.text(1, 2, '0.709', horizontalalignment="center", color='white')
plt.show()


portf_uncorr_seq1 = seqPortfolio(['PE_RATIO', 'CURRENT_EV_TO_12M_SALES', 'EBITDA_MARGIN_3YR_AVG'], 12)
portf_uncorr_seq_inv1 = seqPortfolio(['EBITDA_MARGIN_3YR_AVG','CURRENT_EV_TO_12M_SALES', 'PE_RATIO'], 12)
portf_uncorr_sim1 = simPortfolio(['PE_RATIO', 'CURRENT_EV_TO_12M_SALES', 'EBITDA_MARGIN_3YR_AVG'], 12)


corr_list = []
# For every stock, build a correlation matrix of the time series of its factor, and append it to the corr_list
for stock in stocks:
    ind = df.index.get_loc(stock + ' Equity')
    corr_list.append(df.iloc[ind+2:ind+42].transpose().astype('float64').corr())
# Calculate the df_corr2 as the average correlation matrix between all the correlation matrixes in the list
df_corr2 = pd.concat(corr_list)
df_corr2 = df_corr2.groupby(level=0).mean()
df_corr2 = df_corr2[list(df_corr2.index)]
df_corr2 = df_corr2.loc[tot_factors,tot_factors]


plt.imshow(df_corr2, cmap = 'RdBu_r', interpolation='nearest', vmax=1, vmin=-1)
# Add legend
plt.colorbar()
labels = tot_factors
# Create list with tick_mark positions
tick_marks = [i for i in range(len(tot_factors))]
# Add the tickmarks at the designated position, using the labels of the dataframe
plt.xticks(tick_marks, labels, rotation='vertical', fontsize=9)
plt.yticks(tick_marks, labels, fontsize=9)
plt.show()


uncorr_factors = df_corr2.loc[:,'PE_RATIO'][df_corr2.loc[:,'PE_RATIO'] < 0.2]
uncorr_factors

0
FIVE_YEAR_AVG_EV_TO_T12_EBITDA    0.122001
TRAIL_12M_EBITDA_PER_SHARE       -0.331919
TRAIL_12M_SALES_PER_SH           -0.190967
NET_DEBT_PER_SHARE                0.020325
TANG_BOOK_VAL_PER_SH             -0.069882
NORMALIZED_ACCRUALS_CF_METHOD    -0.137099
EBITDA_MARGIN                    -0.281752
EBITDA_MARGIN_3YR_AVG            -0.105860
RSI_30D                           0.162066
OPERATING_ROIC                   -0.299678
EQY_DPS_NET_5YR_GROWTH           -0.110507
EQY_REC_CONS                     -0.067301
WACC_COST_EQUITY                 -0.051332
NORMALIZED_ROE                   -0.428298
5YR_AVG_RETURN_ON_EQUITY         -0.251224
CUR_MKT_CAP                       0.138047
Name: PE_RATIO, dtype: float64


rank.loc[uncorr_factors.index].sort_values(by='Information', ascending=False)


uncorr_factors = df_corr2.loc[uncorr_factors.index].loc[:,'FIVE_YEAR_AVG_EV_TO_T12_EBITDA'][df_corr2.loc[:,'FIVE_YEAR_AVG_EV_TO_T12_EBITDA'] < 0.2]
uncorr_factors

0
TRAIL_12M_EBITDA_PER_SHARE      -0.085002
TRAIL_12M_SALES_PER_SH          -0.027029
NET_DEBT_PER_SHARE               0.118059
TANG_BOOK_VAL_PER_SH            -0.085059
NORMALIZED_ACCRUALS_CF_METHOD    0.063244
EBITDA_MARGIN                   -0.094127
EBITDA_MARGIN_3YR_AVG           -0.113233
RSI_30D                          0.004878
OPERATING_ROIC                  -0.040279
EQY_DPS_NET_5YR_GROWTH           0.023646
EQY_REC_CONS                     0.034376
WACC_COST_EQUITY                 0.026049
NORMALIZED_ROE                   0.029498
5YR_AVG_RETURN_ON_EQUITY        -0.017209
CUR_MKT_CAP                      0.169720
Name: FIVE_YEAR_AVG_EV_TO_T12_EBITDA, dtype: float64


rank.loc[uncorr_factors.index].sort_values(by='Information', ascending=False)


plt.imshow(df_corr2.loc[['PE_RATIO', 'FIVE_YEAR_AVG_EV_TO_T12_EBITDA', 'NORMALIZED_ACCRUALS_CF_METHOD'], ['PE_RATIO', 'FIVE_YEAR_AVG_EV_TO_T12_EBITDA', 'NORMALIZED_ACCRUALS_CF_METHOD']],
           cmap = 'RdBu_r', interpolation='nearest', vmax=1, vmin=-1)
# Add legend
plt.colorbar()
labels = ['PE_RATIO', 'FIVE_YEAR_AVG_EV_TO_T12_EBITDA', 'NORMALIZED_ACCRUALS_CF_METHOD']
# Create list with tick_mark positions
tick_marks = [i for i in range(3)]
# Add the tickmarks at the designated position, using the labels of the dataframe
plt.xticks(tick_marks, labels, rotation='vertical', fontsize=9)
plt.yticks(tick_marks, labels, fontsize=9)
plt.text(0, 1, '0.122', horizontalalignment="center")
plt.text(0, 2, '-0.137', horizontalalignment="center")
plt.text(1, 2, '0.162', horizontalalignment="center")
plt.show()


portf_uncorr_seq2 = seqPortfolio(['PE_RATIO', 'FIVE_YEAR_AVG_EV_TO_T12_EBITDA', 'NORMALIZED_ACCRUALS_CF_METHOD'], 12)
portf_uncorr_seq_inv2 = seqPortfolio(['NORMALIZED_ACCRUALS_CF_METHOD', 'FIVE_YEAR_AVG_EV_TO_T12_EBITDA', 'PE_RATIO'], 12)
portf_uncorr_sim2 = simPortfolio(['PE_RATIO', 'FIVE_YEAR_AVG_EV_TO_T12_EBITDA', 'NORMALIZED_ACCRUALS_CF_METHOD'], 12)


plt.figure(figsize =(10, 6))

portf_seq_top3.loc['value'].plot(label = 'Portf Seq top3', linestyle = 'dashed', color='C1')
portf_uncorr_seq1.loc['value'].plot(label = 'Portf Seq Uncorrelated 1', color='mediumblue')
portf_uncorr_seq_inv1.loc['value'].plot(label = 'Portf Seq Uncorrelated Inv 1', color='royalblue')
portf_uncorr_seq2.loc['value'].plot(label = 'Portf Seq Uncorrelated 2', color='darkgreen')
portf_uncorr_seq_inv2.loc['value'].plot(label = 'Portf Seq Uncorrelated Inv 2', color='limegreen')
benchmark.loc['value'].plot(label = "benchmark", color='black')
plt.grid()
plt.title("Comparison of Sequential Portfolios")
plt.ylabel("Value (in millions)")
plt.text(508, portf_seq_top3.loc['value'][-1], 'IR: ' + str(round(IR(portf_seq_top3),2)), color='C1')
plt.text(508, portf_uncorr_seq1.loc['value'][-1], 'IR: ' + str(round(IR(portf_uncorr_seq1),2)), color='mediumblue')
plt.text(508, portf_uncorr_seq_inv1.loc['value'][-1], 'IR: ' + str(round(IR(portf_uncorr_seq_inv1),2)), color='royalblue')
plt.text(508, portf_uncorr_seq2.loc['value'][-1] + 50000, 'IR: ' + str(round(IR(portf_uncorr_seq2),2)), color='darkgreen')
plt.text(508, portf_uncorr_seq_inv2.loc['value'][-1] - 50000, 'IR: ' + str(round(IR(portf_uncorr_seq_inv2),2)), color='limegreen')
plt.legend()
plt.show()


plt.figure(figsize =(10, 6))

portf_sim_top6.loc['value'].plot(label = "Simultaneous Top6", color = 'C3', linestyle = 'dashed')
portf_uncorr_sim1.loc['value'].plot(label = "Simultaneous Uncorrelated 1", color = 'mediumblue')
portf_uncorr_sim2.loc['value'].plot(label = "Simultaneous Uncorrelated 2", color = 'darkgreen')
benchmark.loc['value'].plot(label = "benchmark", color='black')
plt.title("Comparison of Sequential Portfolios")
plt.ylabel("Value (in millions)")
plt.grid()
plt.text(508, portf_sim_top6.loc['value'][-1], 'IR: ' + str(round(IR(portf_sim_top6),2)), color='C3')
plt.text(508, portf_uncorr_sim1.loc['value'][-1], 'IR: ' + str(round(IR(portf_uncorr_sim1),2)), color='mediumblue')
plt.text(508, portf_uncorr_sim2.loc['value'][-1], 'IR: ' + str(round(IR(portf_uncorr_sim2),2)), color='darkgreen')
plt.legend()
plt.show()


def weightPortfolio(data, holding, weight, wealth = 600000, n_stocks=15):
    # Initialize worst list
    worst = []
    for factor in range(len(data)):
        worst.append(diz_factors[data[factor]])
    # Initiate dataset
    sub = pd.DataFrame(columns=time, index = stocks)
    # Set i = 1, we want to start at month 2, so that we can use the information of the factor at month 1
    i=1
    # Remove the transaction cost in the first iteration of wealth
    wealth = wealth * 0.998
    # Initialize weight dataset
    weight = get_factor(weight).replace(0, np.nan)**(-1)
    # Loop through i. i is going to represent the month in which the selected stocks are modified based on the factor
    while i < len(time):
        # If we are not in the first month, we have to compute the portfolio value in the new month before reallocation
        if i != 1:
            portfT = sub.iloc[:,i-1] * price_mul.iloc[:,i]
            # Overwrite the wealth as the portfolio total value
            wealth = portfT.sum()
        # Create a copy of the factors
        fac = []
        for el in range(len(data)):
            fac.append(get_factor(data[el]).loc[price_mul.iloc[:,i].notnull(),:])
        # Select stocks based on factor
        sub_factor = pd.DataFrame(columns=range(len(data)+1), index = fac[0].index)
        for el in range(len(data)):
            new_col = scale(fac[el].iloc[:,i-1])
            if worst[el]:
                new_col = new_col * -1
            sub_factor[el] = new_col
        sub_factor.iloc[:,-1] = sub_factor.mean(axis=1)
        sel_stocks = sub_factor.sort_values(sub_factor.columns[-1], ascending=False).index[:n_stocks]
        # Get sum of the weights
        tot_weight = weight.loc[sel_stocks, weight.columns[i-1]].sum()
        sub_weight = weight.loc[sel_stocks, weight.columns[i-1]] / tot_weight
        # Allocate wealth equally through the selected stocks
        sub.loc[sel_stocks, sub.columns[i]] = wealth * sub_weight
        # If we are not in the first month, than we have to compute the transaction costs
        if i != 1:
            # We compute the transaction costs based on the difference between the portfolio before and after reallocation
            trans_cost = abs(sub.iloc[:,i] - portfT).sum() * 0.002
            # Remove the transaction cost from total wealth
            wealth = wealth - trans_cost
            # Overwrite allocation of wealth equally through the selected stocks
            sub.loc[sel_stocks, sub.columns[i]] = wealth * sub_weight
        # J is going to loop through the months in which there is no reallocation
        for j in range(1,holding):
            # Break the j loop if the months are finished
            if i + j > len(time)-1:
                break
            # Compute the value of all the stocks at time t as their value at time t-1 times theprice multiplicator at time t
            sub.iloc[:,i+j] = sub.iloc[:,i+j-1] * price_mul.iloc[:,i+j]
        # Increment i by the holding period before next iteration
        i += holding
    # Since we started from i=1, the first element of the subset is going to be 0, so we are going to remove it
    # Build output dataframe
    portf = pd.DataFrame(columns=time[1:])
    # Insert value information
    portf.loc['value'] = sub.sum()[1:]
    # Insert return information    
    portf.loc['returns'] = (portf.loc['value'].astype('float') - portf.loc['value'].shift(1).astype('float'))/portf.loc['value'].shift(1).astype('float')
    portf.loc['cum_returns'] = (portf.loc['value'].astype('float') - portf.loc['value'][0])/portf.loc['value'][0]
    portf.loc['log_returns'] = np.log(portf.loc['value'].astype('float')) - np.log(portf.loc['value'].shift(1).astype('float'))
    return portf


portf_weight_180 = weightPortfolio(['PE_RATIO','CURRENT_EV_TO_12M_SALES','FIVE_YEAR_AVG_EV_TO_T12_EBITDA', 'CURRENT_EV_TO_T12M_EBITDA', 'PX_TO_BOOK_RATIO', 'NORMALIZED_ACCRUALS_CF_METHOD'], 12, 'VOLATILITY_180D')
portf_weight_90 = weightPortfolio(['PE_RATIO','CURRENT_EV_TO_12M_SALES','FIVE_YEAR_AVG_EV_TO_T12_EBITDA', 'CURRENT_EV_TO_T12M_EBITDA', 'PX_TO_BOOK_RATIO', 'NORMALIZED_ACCRUALS_CF_METHOD'], 12, 'VOLATILITY_90D')
portf_weight_30 = weightPortfolio(['PE_RATIO','CURRENT_EV_TO_12M_SALES','FIVE_YEAR_AVG_EV_TO_T12_EBITDA', 'CURRENT_EV_TO_T12M_EBITDA', 'PX_TO_BOOK_RATIO', 'NORMALIZED_ACCRUALS_CF_METHOD'], 12, 'VOLATILITY_30D')


plt.figure(figsize =(10, 6))

portf_weight_180.loc['value'].plot(label = "Weighted 180D", color = 'C0')
portf_weight_90.loc['value'].plot(label = "Weighted 90D", color = 'C1')
portf_weight_30.loc['value'].plot(label = "Weighted 30D", color = 'C2')
portf_sim_top6.loc['value'].plot(label = "Simultaneous top6", color = 'C3', linestyle = 'dashed')
benchmark.loc['value'].plot(label = "Benchmark", color='black')
plt.title("Comparison between weighted and non-weighted simultaneous portfolios")
plt.text(508, portf_weight_180.loc['value'][-1], 'IR: ' + str(round(IR(portf_weight_180),2)), color='C0')
plt.text(508, portf_weight_90.loc['value'][-1], 'IR: ' + str(round(IR(portf_weight_90),2)), color='C1')
plt.text(508, portf_weight_30.loc['value'][-1], 'IR: ' + str(round(IR(portf_weight_30),2)), color='C2')
plt.text(508, portf_sim_top6.loc['value'][-1], 'IR: ' + str(round(IR(portf_sim_top6),2)), color='C3')
plt.grid()
plt.legend()
plt.ylabel("value (in tens millions)")
plt.show()


# Initialize an empty rank dataframe
rank_weight = pd.DataFrame(columns = ['Information', 'Sharpe', 'Sortino'],
                    index = ['portf_weight_180', 'portf_weight_90', 'portf_weight_30', 'portf_sim_top6'])


# For each portfolio insert in the dataframe its resulting metrics
port_list = [portf_weight_180, portf_weight_90, portf_weight_30, portf_sim_top6]
for i in range(4):
    rank_weight.iloc[i]['Information'] = IR(port_list[i])
    rank_weight.iloc[i]['Sharpe'] = Sharpe(port_list[i])
    rank_weight.iloc[i]['Sortino'] = Sortino(port_list[i])


rank_weight


from prophet import Prophet


sel_factors = list(rank.index[0:5])


wealth = 600000
skip_count = 0
i = 12
sub = pd.DataFrame(columns=time, index = stocks)
while i < len(time) - 12:
    # If we are not in the first month, we have to compute the portfolio value in the new month before reallocation
    if i != 12:
        portfT = sub.iloc[:,i-1] * price_mul.iloc[:,i]
        # Overwrite the wealth as the portfolio total value
        wealth = portfT.sum()
    stock_rank = {}
    # For each stock, build the prophet prediction
    for stock in stocks:
        # Get the stock data
        ind = df.index.get_loc(stock + ' Equity')
        df_stock = df.iloc[range(ind + 2, ind + 42)].transpose()
        # Change the data shape and names in order to be used as input by Prophet
        df_stock['ds'] = df.loc['Date'].iloc[0]
        df_stock.index = range(len(df_stock))
        df_stock = df_stock.rename(columns = {'PX_LAST':'y'})
        # If the stock has no information about the value of the factors that are going to be used as explanatory variable, skip this stock
        if df_stock[sel_factors].isnull().sum().sum() != 0:
            skip_count += 1
            continue
        # The train dataset utilizes all the factor values available until before the past year
        train = df_stock[:i]
        # + the price in the subsequent year of the factors (until past year)
        train['y'] =  df_stock.loc[range(12,i+12),'y'].values.tolist()
        # The test dataset utilizes all the factor values available in the past year
        test = pd.DataFrame(df_stock[i:i+12])
        m = Prophet()
        # This loop add every selected factor as explanatory variable in the model
        for factor in sel_factors:
            m.add_regressor(factor)
        # Train the data
        m.fit(train)
        # Build the prediction
        prediction = m.predict(test)
        # Compute the predicted return
        stock_rank[stock] = (prediction.iloc[-1]['yhat'] - train.iloc[-1]['y']) / train.iloc[-1]['y']
    # The selected stocks are going to be those with the best predicted return
    sel_stocks = pd.DataFrame(sorted(stock_rank.items(), key=lambda x:x[1], reverse=True)[:15])[0]
    # Allocate wealth equally through the selected stocks
    sub.loc[sel_stocks, sub.columns[i]] = wealth/len(sel_stocks)
    # If we are not in the first month, than we have to compute the transaction costs
    if i != 12:
        # We compute the transaction costs based on the difference between the portfolio before and after reallocation
        trans_cost = abs(sub.iloc[:,i] - portfT).sum() * 0.002
        # Remove the transaction cost from total wealth
        wealth = wealth - trans_cost
        # Overwrite allocation of wealth equally through the selected stocks
        sub.loc[sel_stocks, sub.columns[i]] = wealth/len(sel_stocks)
    # J is going to loop through the months in which there is no reallocation
    for j in range(1,12):
        # Break the j loop if the months are finished
        if i + j > len(time)-1:
            break
        # Compute the value of all the stocks at time t as their value at time t-1 times theprice multiplicator at time t
        sub.iloc[:,i+j] = sub.iloc[:,i+j-1] * price_mul.iloc[:,i+j]
    # Increment i by the holding period before next iteration
    i += 12
portf = pd.DataFrame(columns=time[12:])
# Insert value information
portf.loc['value'] = sub.sum()
# Insert return information    
portf.loc['returns'] = (portf.loc['value'].astype('float') - portf.loc['value'].shift(1).astype('float'))/portf.loc['value'].shift(1).astype('float')
portf.loc['cum_returns'] = (portf.loc['value'].astype('float') - portf.loc['value'][0])/portf.loc['value'][0]
portf.loc['log_returns'] = np.log(portf.loc['value'].astype('float')) - np.log(portf.loc['value'].shift(1).astype('float'))


nan_factors = []
for factor_name in sel_factors:
    factor = get_factor(factor_name)
    print(factor_name, factor.isnull().any(axis=1).sum())

CUR_MKT_CAP 14
PE_RATIO 453
CURRENT_EV_TO_12M_SALES 415
FIVE_YEAR_AVG_EV_TO_T12_EBITDA 526
CURRENT_EV_TO_T12M_EBITDA 498


prop_portf = portf


# Build a benchmark that start at the same month
sub = pd.DataFrame(columns=time, index = stocks)
i=13
# Remove the transaction cost in the first iteration of wealth
wealth = 600000 * 0.998
# To calculate the benchmark we select only the stocks that have no null information in the price df
sel_stocks = price[~price.isnull().any(axis=1)].index
# Allocate wealth equally through the selected stocks
sub.loc[sel_stocks, sub.columns[12]] = wealth/len(sel_stocks)
# Calculate the value of the portfolio at each iteration of time
while i < len(time):
    sub.iloc[:,i] = sub.iloc[:,i-1] * price_mul.iloc[:,i]
    i += 1
# Build output dataframe
bench = pd.DataFrame(columns=time[12:])
# Insert value information
bench.loc['value'] = sub.sum()[1:]
# Insert return information
bench.loc['returns'] = (bench.loc['value'].astype('float') - bench.loc['value'].shift(1).astype('float'))/bench.loc['value'].shift(1).astype('float')
bench.loc['cum_returns'] = (bench.loc['value'].astype('float') - bench.loc['value'][0])/bench.loc['value'][0]
bench.loc['log_returns'] = np.log(bench.loc['value'].astype('float')) - np.log(bench.loc['value'].shift(1).astype('float'))


bench_mom = bench


plt.figure(figsize =(10, 6))

prop_portf_CURMKTCAP.loc['value'][:-12].plot(label = "Prophet CURMKTCAP", color = 'C0')
prop_portf.loc['value'][:-12].plot(label = "Prophet", color = 'C1')
bench_mom.loc['value'][:-12].plot(label = 'benchmark', color = 'black')
plt.title("Results of the Prophet Portfolio")
plt.ylabel("Value (in milions)")
plt.grid()
plt.legend()
plt.show()


wealth = 600000
i = 6
wealth = wealth * 0.998
sub = pd.DataFrame(columns=time, index = stocks)
while i < len(time):
    # If we are not in the first month, we have to compute the portfolio value in the new month before reallocation
    if i != 6:
        portfT = sub.iloc[:,i-1] * price_mul.iloc[:,i]
        # Overwrite the wealth as the portfolio total value
        wealth = portfT.sum()
    momentum_diz = {}
    # For each stock, we compute its momentum
    for stock in stocks:
        # The momentum is computed as [price(time-6) - price(time-2)] / price(time-6)
        momentum = (price.loc[stock,price.columns[i-2]] - price.loc[stock,price.columns[i-6]]) / price.loc[stock,price.columns[i-6]]
        # We save the momentum in a dictionary
        momentum_diz[stock] = momentum
    # We select the 15 stocks with the best momentum
    sel_stocks = pd.DataFrame(sorted(momentum_diz.items(), key=lambda x:x[1], reverse=True)[:15])[0]
    # Allocate wealth equally through the selected stocks
    sub.loc[sel_stocks, sub.columns[i]] = wealth/len(sel_stocks)
    # If we are not in the first month, than we have to compute the transaction costs
    if i != 6:
        # We compute the transaction costs based on the difference between the portfolio before and after reallocation
        trans_cost = abs(sub.iloc[:,i] - portfT).sum() * 0.002
        # Remove the transaction cost from total wealth
        wealth = wealth - trans_cost
        # Overwrite allocation of wealth equally through the selected stocks
        sub.loc[sel_stocks, sub.columns[i]] = wealth/len(sel_stocks)
    # J is going to loop through the months in which there is no reallocation
    for j in range(1,3):
        # Break the j loop if the months are finished
        if i + j > len(time)-1:
            break
        # Compute the value of all the stocks at time t as their value at time t-1 times theprice multiplicator at time t
        sub.iloc[:,i+j] = sub.iloc[:,i+j-1] * price_mul.iloc[:,i+j]
    # Increment i by the holding period before next iteration
    i += 3
portf = pd.DataFrame(columns=time[6:])
# Insert value information
portf.loc['value'] = sub.sum()
# Insert return information    
portf.loc['returns'] = (portf.loc['value'].astype('float') - portf.loc['value'].shift(1).astype('float'))/portf.loc['value'].shift(1).astype('float')
portf.loc['cum_returns'] = (portf.loc['value'].astype('float') - portf.loc['value'][0])/portf.loc['value'][0]
portf.loc['log_returns'] = np.log(portf.loc['value'].astype('float')) - np.log(portf.loc['value'].shift(1).astype('float'))


momentum_portf = portf


# Build a benchmark that start at the same month
sub = pd.DataFrame(columns=time, index = stocks)
i=7
# Remove the transaction cost in the first iteration of wealth
wealth = 600000 * 0.998
# To calculate the benchmark we select only the stocks that have no null information in the price df
sel_stocks = price[~price.isnull().any(axis=1)].index
# Allocate wealth equally through the selected stocks
sub.loc[sel_stocks, sub.columns[6]] = wealth/len(sel_stocks)
# Calculate the value of the portfolio at each iteration of time
while i < len(time):
    sub.iloc[:,i] = sub.iloc[:,i-1] * price_mul.iloc[:,i]
    i += 1
# Build output dataframe
bench = pd.DataFrame(columns=time[6:])
# Insert value information
bench.loc['value'] = sub.sum()[1:]
# Insert return information
bench.loc['returns'] = (bench.loc['value'].astype('float') - bench.loc['value'].shift(1).astype('float'))/bench.loc['value'].shift(1).astype('float')
bench.loc['cum_returns'] = (bench.loc['value'].astype('float') - bench.loc['value'][0])/bench.loc['value'][0]
bench.loc['log_returns'] = np.log(bench.loc['value'].astype('float')) - np.log(bench.loc['value'].shift(1).astype('float'))


bench_mom = bench


plt.figure(figsize =(10, 6))

momentum_portf.loc['value'].plot(label = 'momentum', color='C1')
bench_mom.loc['value'].plot(label = 'benchmark', color='black')

plt.text(508, momentum_portf.loc['value'][-1], 'IR: ' + str(round(IR(momentum_portf),2)), color='C1')

plt.grid()
plt.title('Comparison of Momentum Portfolio with Benchmark')
plt.ylabel('value (in tens of milions)')
plt.legend()
plt.show()

	1	2	3	4	5	6	7	8	9	10	...	102	103	104	105	106	107	108	109	110	111
0
ABI BB Equity	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Date	2003-01-31 00:00:00	2003-02-28 00:00:00	2003-03-31 00:00:00	2003-04-30 00:00:00	2003-05-30 00:00:00	2003-06-30 00:00:00	2003-07-31 00:00:00	2003-08-29 00:00:00	2003-09-30 00:00:00	2003-10-31 00:00:00	...	2011-06-30 00:00:00	2011-07-29 00:00:00	2011-08-31 00:00:00	2011-09-30 00:00:00	2011-10-31 00:00:00	2011-11-30 00:00:00	2011-12-30 00:00:00	2012-01-31 00:00:00	2012-02-29 00:00:00	2012-03-30 00:00:00
PE_RATIO	19.5111	16.8231	19.8095	21.8594	20.6439	19.2732	19.3167	21.3866	21.6364	20.2557	...	19.2185	19.1443	18.3465	17.713	18.5708	19.7267	16.705	16.5097	18.3504	19.8999
FIVE_YR_AVG_PRICE_EARNINGS	0	0	0	0	0	0	0	0	0	0	...	18.7176	18.7176	18.7176	18.7176	18.7176	18.7176	18.7709	18.7709	18.7709	18.7709
T12M_DIL_PE_CONT_OPS	0	0	0	0	0	0	0	0	0	0	...	18.836	18.7632	17.9813	17.3605	18.2012	19.334	16.3761	16.1847	17.9892	19.5082
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
NORMALIZED_ROE	0	0	0	0	0	0	0	0	0	0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
5YR_AVG_RETURN_ON_EQUITY	0	0	0	0	0	0	0	0	0	0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
CUR_MKT_CAP	0	0	0	0	0	0	0	0	0	0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
NORMALIZED_ACCRUALS_BS_METHOD	0	0	0	0	0	0	0	0	0	0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
PX_TO_BOOK_RATIO	0	0	0	0	0	0	0	0	0	0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

Date	2003-01-31	2003-02-28	2003-03-31	2003-04-30	2003-05-30	2003-06-30	2003-07-31	2003-08-29	2003-09-30	2003-10-31	...	2011-06-30	2011-07-29	2011-08-31	2011-09-30	2011-10-31	2011-11-30	2011-12-30	2012-01-31	2012-02-29	2012-03-30
ABI BB	NaN	-0.153347	0.152804	0.074728	-0.109818	0.076775	0.024005	0.125930	-0.050171	-0.059747	...	-0.046884	0.004988	-0.044508	0.033880	0.011001	0.096745	0.065631	-0.020287	0.084059	0.082937
FP FP	NaN	-0.024176	-0.055337	0.012849	0.057868	0.055463	0.001517	0.058213	-0.075046	0.031146	...	-0.003754	-0.054095	-0.105566	-0.022610	0.129102	0.013657	0.029937	0.022653	0.038478	-0.093549
UNA NA	NaN	0.005720	0.036402	0.034241	-0.127350	-0.061837	0.075821	0.005935	-0.003955	-0.010547	...	-0.003312	0.001989	0.039603	0.008030	0.049882	0.012932	0.049176	-0.043067	-0.020644	0.023195
SAN FP	NaN	0.019742	-0.073364	0.147717	0.016698	-0.063619	-0.017805	0.021719	0.019343	0.019915	...	0.007605	-0.020961	-0.069203	-0.026199	0.051536	0.002499	0.085683	-0.004946	-0.017146	0.047838
ENI IM	NaN	-0.024254	-0.116373	0.045515	0.070347	-0.040761	-0.004947	0.048619	-0.047704	0.040117	...	-0.021232	-0.071800	-0.080921	-0.057326	0.190233	-0.019578	0.020828	0.054100	0.023971	0.016046

Date	2003-02-28	2003-03-31	2003-04-30	2003-05-30	2003-06-30	2003-07-31	2003-08-29	2003-09-30	2003-10-31	2003-11-28	...	2011-06-30	2011-07-29	2011-08-31	2011-09-30	2011-10-31	2011-11-30	2011-12-30	2012-01-31	2012-02-29	2012-03-30
value	598800.0	587425.533415	660207.337435	692876.772973	731648.311379	773002.994227	821343.08105	823181.517205	900586.823293	929618.039072	...	2302978.120544	2203851.081235	2021796.168429	1872358.936239	1986169.576593	1907954.125365	1919944.106303	2045750.950641	2142830.946715	2152527.775027
returns	NaN	-0.018995	0.1239	0.049484	0.055957	0.056523	0.062535	0.002238	0.094032	0.032236	...	-0.017381	-0.043043	-0.082608	-0.073913	0.060785	-0.03938	0.006284	0.065526	0.047454	0.004525
cum_returns	0.019363	0.0	0.1239	0.179514	0.245517	0.315917	0.398208	0.401338	0.533108	0.582529	...	2.92046	2.751711	2.441791	2.187398	2.381143	2.247993	2.268404	2.482571	2.647834	2.664342
log_returns	NaN	-0.019178	0.116804	0.048298	0.054448	0.054983	0.060658	0.002236	0.08987	0.031727	...	-0.017533	-0.043997	-0.08622	-0.076787	0.059009	-0.040176	0.006265	0.063469	0.046363	0.004515

	Information	Sharpe	Sortino
CUR_MKT_CAP	2.572272	0.964028	1.774545
PE_RATIO	1.268581	0.775805	0.982374
CURRENT_EV_TO_12M_SALES	1.190999	0.694106	0.894214
FIVE_YEAR_AVG_EV_TO_T12_EBITDA	1.158677	0.756905	0.924165
CURRENT_EV_TO_T12M_EBITDA	1.107566	0.72325	0.874313
PX_TO_BOOK_RATIO	0.890569	0.567376	1.038024
NORMALIZED_ACCRUALS_CF_METHOD	0.887866	0.667879	1.106075
5YR_AVG_RETURN_ON_EQUITY	0.852702	0.560658	0.861664
TRAIL_12M_EBITDA_PER_SHARE	0.735485	0.576269	1.07594
EBITDA_MARGIN_3YR_AVG	0.668972	0.583426	0.969576
TRAIL_12M_SALES_PER_SH	0.597328	0.588323	0.883077
EBITDA_MARGIN	0.578179	0.545594	1.023334
RSI_9D	0.532972	0.850518	1.085326
PX_TO_TANG_BV_PER_SH	0.488429	0.584849	0.87474
RSI_30D	0.453949	0.516281	0.768311
EQY_DPS_NET_5YR_GROWTH	0.388422	0.704911	1.085478
FIVE_YR_AVG_PRICE_EARNINGS	0.334923	0.6265	0.7239
RSI_14D	0.306011	0.482653	0.85147
NET_DEBT_PER_SHARE	0.142135	0.846456	1.029318
WACC_COST_EQUITY	-0.047826	0.889192	1.347257
OPERATING_ROIC	-0.212811	0.722881	0.958099
EQY_REC_CONS	-0.225417	0.702182	0.759324
NORMALIZED_ROE	-0.287531	0.62834	0.76751
TANG_BOOK_VAL_PER_SH	-0.93403	0.45706	0.514472

Date	2003-02-28	2003-03-31	2003-04-30	2003-05-30	2003-06-30	2003-07-31	2003-08-29	2003-09-30	2003-10-31	2003-11-28	...	2011-06-30	2011-07-29	2011-08-31	2011-09-30	2011-10-31	2011-11-30	2011-12-30	2012-01-31	2012-02-29	2012-03-30
value	598800.0	604790.262009	661637.714469	713975.581809	737923.832265	787567.109524	843784.158616	846232.486133	956529.571639	962561.761368	...	6194101.956307	6030898.120106	5520898.077803	5091280.32388	5350267.86917	4957809.266168	4945702.428624	5252104.850658	5554369.514622	5807879.492871
returns	NaN	0.010004	0.093995	0.079104	0.033542	0.067274	0.071381	0.002902	0.130339	0.006306	...	-0.022299	-0.026348	-0.084565	-0.077817	0.050869	-0.073353	-0.002442	0.061953	0.057551	0.045642
cum_returns	0.0	0.010004	0.104939	0.192344	0.232338	0.315242	0.409125	0.413214	0.597411	0.607485	...	9.344192	9.07164	8.219937	7.502472	7.934983	7.279575	7.259356	7.77105	8.275834	8.699198
log_returns	NaN	0.009954	0.089836	0.076131	0.032992	0.065108	0.068948	0.002897	0.122518	0.006287	...	-0.022552	-0.026702	-0.088355	-0.081011	0.049617	-0.076183	-0.002445	0.06011	0.055956	0.044631

Financial Markets Analytics Project

Formigoni Alberto, Giardini Davide, Haardt Vittorio

Assumptions¶

Preliminary Operations on Data¶

Importation¶

Factor Selection¶

Price Extraction¶

Benchmark¶

UniVariate Screening Models¶

Function for UniVariate Screening¶

Building the necessary metrics¶

Information Ratio¶

Sharpe Ratio¶

Sortino Ratio¶

Choosing the Parameters¶

Top-Bottom¶

Holding Period¶

Univariate Screening Models Portfolios¶

A deeper look into Current Market Cap¶

Sequential Screening¶

Simultaneous Screening¶

Accounting for correlation between factors¶

Correlation Between Univariate Portfolios¶

Correlation Between Factors¶

Results¶

Weighted Portfolio¶

Prophet Portfolio¶

Momentum Portfolio¶

	Information	Sharpe	Sortino
portf_seq_top3	1.092642	0.729568	0.895137
portf_seq_top3_inv	0.841387	0.657162	0.803223

Date	2003-02-28	2003-03-31	2003-04-30	2003-05-30	2003-06-30	2003-07-31	2003-08-29	2003-09-30	2003-10-31	2003-11-28	...	2011-06-30	2011-07-29	2011-08-31	2011-09-30	2011-10-31	2011-11-30	2011-12-30	2012-01-31	2012-02-29	2012-03-30
value	598800.0	617508.808839	706541.893857	710354.620817	756534.472363	848790.42887	884299.704912	880158.792739	968782.255865	1016708.20419	...	3959043.090802	3900238.491907	3510312.977577	3278671.988075	3390207.807767	3244822.484614	3303015.975812	3384307.127542	3671914.783331	3826073.925732
returns	NaN	0.031244	0.144181	0.005396	0.06501	0.121945	0.041835	-0.004683	0.10069	0.04947	...	-0.063493	-0.014853	-0.099975	-0.065989	0.034019	-0.042884	0.017934	0.024611	0.084983	0.041983
cum_returns	0.0	0.031244	0.17993	0.186297	0.263418	0.417486	0.476786	0.469871	0.617873	0.697909	...	5.611628	5.513424	4.862246	4.475404	4.66167	4.418875	4.516059	4.651816	5.132122	5.389569
log_returns	NaN	0.030766	0.134689	0.005382	0.062984	0.115064	0.040984	-0.004694	0.095938	0.048286	...	-0.065598	-0.014965	-0.105333	-0.068267	0.033453	-0.043831	0.017775	0.024313	0.081564	0.041126

Date	2003-02-28	2003-03-31	2003-04-30	2003-05-30	2003-06-30	2003-07-31	2003-08-29	2003-09-30	2003-10-31	2003-11-28	...	2011-06-30	2011-07-29	2011-08-31	2011-09-30	2011-10-31	2011-11-30	2011-12-30	2012-01-31	2012-02-29	2012-03-30
value	598800.0	596411.815668	686730.452715	695754.610035	774412.47938	862734.482066	1039841.880571	999939.47258	1186294.118391	1175397.848545	...	13343887.837536	12947041.015565	11874416.913412	11222909.600969	11364509.042881	10892290.292138	11110460.672337	11517024.695496	12198350.325132	12524348.917149
returns	NaN	-0.003988	0.151437	0.013141	0.113054	0.11405	0.205286	-0.038374	0.186366	-0.009185	...	-0.056139	-0.02974	-0.082847	-0.054866	0.012617	-0.041552	0.02003	0.036593	0.059158	0.026725
cum_returns	0.0	-0.003988	0.146844	0.161915	0.293274	0.440772	0.736543	0.669906	0.981119	0.962922	...	21.284382	20.621645	18.830356	17.742334	17.978806	17.190198	17.554544	18.233508	19.371327	19.915746
log_returns	NaN	-0.003996	0.14101	0.013055	0.107108	0.108002	0.186717	-0.039129	0.170895	-0.009228	...	-0.057776	-0.030191	-0.086481	-0.056429	0.012538	-0.04244	0.019832	0.035939	0.057474	0.026374

	Information	Sharpe	Sortino
portf_weight_180	1.022047	0.705464	0.969694
portf_weight_90	0.981541	0.698841	0.948778
portf_weight_30	1.053439	0.71037	1.028956
portf_sim_top6	1.138483	0.696275	1.12746